{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Blank out CUDA_VISIBLE_DEVICES before the heavier imports in the next cells\n",
    "# (presumably so malaya stays on CPU -- confirm).\n",
    "os.environ.update({'CUDA_VISIBLE_DEVICES': ''})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import malaya\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import random\n",
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from malaya.text.vectorizer import SkipGramCountVectorizer\n",
    "\n",
    "# Load the stopword list once and share it between both vectorizers\n",
    "# (it used to be fetched twice with identical results).\n",
    "stopwords = malaya.text.function.get_stopwords()\n",
    "\n",
    "# Settings common to both vectorizers: 1- to 3-grams, stopwords removed,\n",
    "# original casing preserved.\n",
    "vectorizer_kwargs = dict(\n",
    "    ngram_range = (1, 3),\n",
    "    stop_words = stopwords,\n",
    "    lowercase = False,\n",
    ")\n",
    "\n",
    "# NOTE(review): neither vectorizer is referenced by a later cell visible in\n",
    "# this notebook -- confirm they are still needed.\n",
    "bow = CountVectorizer(**vectorizer_kwargs)\n",
    "\n",
    "# Same n-gram setup, additionally passing skip = 2.\n",
    "skip_bow = SkipGramCountVectorizer(skip = 2, **vectorizer_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the translated metadata files in a stable (sorted) order.\n",
    "files = glob('meta_*.json.filtered.translated')\n",
    "files.sort()\n",
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'meta_Cell_Phones_and_Accessories.json.filtered.translated'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first matched file name.\n",
    "files[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100046it [00:02, 49158.78it/s]\n",
      "16041it [00:01, 15423.25it/s]\n",
      "12299it [00:00, 54132.92it/s]\n",
      "44679it [00:00, 57923.72it/s]\n",
      "50632it [00:02, 17477.19it/s]\n",
      "100000it [00:06, 14285.84it/s]\n",
      "100000it [00:06, 15188.76it/s]\n",
      "100000it [00:06, 16384.58it/s]\n",
      "10813it [00:00, 47476.29it/s]\n",
      "26790it [00:01, 20997.11it/s]\n",
      "100000it [00:08, 11836.38it/s]\n",
      "100000it [00:04, 20350.18it/s]\n",
      "84822it [00:02, 32959.26it/s]\n"
     ]
    }
   ],
   "source": [
    "# Selection knobs (previously inline magic numbers).\n",
    "MIN_WORDS, MAX_WORDS = 20, 300   # exclusive bounds on whitespace-token count\n",
    "MIN_UNIQUE_WORDS = 10            # a record needs MORE than this many distinct tokens\n",
    "MAX_PER_FILE = 40000             # cap on records sampled from each file\n",
    "SAMPLE_SEED = 42\n",
    "\n",
    "# Private RNG so a rerun picks the same records without touching the\n",
    "# global `random` state used by other cells.\n",
    "sampler = random.Random(SAMPLE_SEED)\n",
    "\n",
    "selected = []\n",
    "for f in files:\n",
    "    selected_ = []\n",
    "    # JSON text is UTF-8; do not depend on the platform default encoding.\n",
    "    with open(f, encoding = 'utf-8') as fopen:\n",
    "        for line in tqdm(fopen):\n",
    "            if not line.strip():\n",
    "                continue  # tolerate blank lines in the JSON-lines file\n",
    "            data = json.loads(line)\n",
    "            splitted = data['translated'].split()\n",
    "            if MIN_WORDS < len(splitted) < MAX_WORDS and len(set(splitted)) > MIN_UNIQUE_WORDS:\n",
    "                selected_.append(data)\n",
    "    # Take at most MAX_PER_FILE records per file so no single file dominates.\n",
    "    selected.extend(sampler.sample(selected_, min(len(selected_), MAX_PER_FILE)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "383303"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of records kept across all files.\n",
    "len(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'EZOPower Battery Charger for Samsung GALAXY S2 / SII I9100 / Galaxy S II SGH-i777 / Exhibit 4G T759'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the title stored under the first kept record's 'data' entry.\n",
    "selected[0]['data']['title']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def simple_cleaning(string):\n",
    "    \"\"\"Pass `string` through unidecode, turn newlines into spaces,\n",
    "    squeeze runs of spaces into one and trim both ends.\"\"\"\n",
    "    flattened = unidecode(string).replace('\\n', ' ')\n",
    "    squeezed = re.sub(r'[ ]+', ' ', flattened)\n",
    "    return squeezed.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['<br /> Design compact with collapsible plug for easy storage and portability <br /> Automatik switches to safe mode when full charge is detected to prevent overcharging <br /> Built in short circuit protection <br /> Input',\n",
       " 'USB Charging Port</b> <br /> Recharge bateri spare directly in wall outlet with this compact EZOPower charger <br /> USB port allows you to charge 2nd device simultaneously',\n",
       " '<br /> <br /> <b>Compatible with Samsung Galaxy S2/SII Exhibit SGH-T679/SGH-T759/SGH-i777/i9100 series',\n",
       " '<b>EZOPower Samsung GALAXY S2 Battery Wall Charger',\n",
       " '2V  350mA <br /> USB Output']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try RAKE on a single record and drop phrases that repeat case-insensitively.\n",
    "t = selected[0]['translated']\n",
    "keywords_rake = malaya.keyword.extractive.rake(\n",
    "    t,\n",
    "    top_k = random.randint(5, 12),\n",
    ")\n",
    "\n",
    "already = set()\n",
    "filtered = []\n",
    "for scored in keywords_rake:\n",
    "    phrase = scored[1]  # index 1 holds the phrase text\n",
    "    folded = phrase.lower()\n",
    "    if folded not in already:\n",
    "        already.add(folded)\n",
    "        filtered.append(phrase)\n",
    "\n",
    "filtered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: despite the name, these are HTML tag fragments used to reject keyword\n",
    "# candidates; the name `months` is kept because the extraction cell reads it.\n",
    "# Every entry is lowercase because candidates are lowercased before matching\n",
    "# (the former '</LI></UL>' spelling could never match).\n",
    "months = {'<li>', '</li>', '</ul>', '<p>', '<b>', '</b>', '<br>', '</br>', '</li></ul>'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████| 383303/383303 [14:38<00:00, 436.14it/s]\n"
     ]
    }
   ],
   "source": [
    "# Keyword-extraction knobs (previously inline magic numbers).\n",
    "KEYWORD_SEED = 42\n",
    "MIN_PHRASE_CHARS, MAX_PHRASE_CHARS = 10, 150   # exclusive bounds on phrase length\n",
    "\n",
    "# Private RNG: reruns yield the same dataset without reseeding global `random`.\n",
    "keyword_rng = random.Random(KEYWORD_SEED)\n",
    "\n",
    "\n",
    "def is_usable_phrase(phrase):\n",
    "    \"\"\"Return True when a raw RAKE phrase has more than one word, a length\n",
    "    strictly between the two bounds, and no HTML tag fragment from `months`.\"\"\"\n",
    "    lowered = phrase.lower()\n",
    "    if len(phrase.split()) <= 1:\n",
    "        return False\n",
    "    if not MIN_PHRASE_CHARS < len(phrase) < MAX_PHRASE_CHARS:\n",
    "        return False\n",
    "    # Case-insensitive, so '<BR />' is rejected just like '<br />'.\n",
    "    if '<br' in lowered:\n",
    "        return False\n",
    "    # Whole-token match after dropping hyphens, then plain substring match.\n",
    "    if set(lowered.replace('-', '').split()) & months:\n",
    "        return False\n",
    "    return not any(tag in lowered for tag in months)\n",
    "\n",
    "\n",
    "def extract_keywords(text, top_k):\n",
    "    \"\"\"Run RAKE on `text` and return the cleaned phrases that pass\n",
    "    `is_usable_phrase`, de-duplicated case-insensitively in first-seen order.\"\"\"\n",
    "    ranked = malaya.keyword.extractive.rake(text, top_k = top_k)\n",
    "    seen, phrases = set(), []\n",
    "    for entry in ranked:\n",
    "        raw = entry[1]  # index 1 holds the phrase text\n",
    "        if not is_usable_phrase(raw):\n",
    "            continue\n",
    "        cleaned = simple_cleaning(raw)\n",
    "        folded = cleaned.lower()\n",
    "        if folded not in seen:\n",
    "            seen.add(folded)\n",
    "            phrases.append(cleaned)\n",
    "    return phrases\n",
    "\n",
    "\n",
    "before, after = [], []\n",
    "failures = 0\n",
    "for record in tqdm(selected):\n",
    "    t = record['translated']\n",
    "    try:\n",
    "        filtered = extract_keywords(t, top_k = keyword_rng.randint(5, 12))\n",
    "    except Exception as e:\n",
    "        # Best effort: one bad record must not abort the whole pass, but it is\n",
    "        # counted so failures stay visible in the summary below.\n",
    "        failures += 1\n",
    "        print(e)\n",
    "        continue\n",
    "    # Randomised minimum (2 to 4) on how many phrases a record needs to be kept.\n",
    "    if len(filtered) >= keyword_rng.randint(2, 4):\n",
    "        before.append(filtered)\n",
    "        after.append(t)\n",
    "\n",
    "if failures:\n",
    "    print(f'{failures} of {len(selected)} records failed keyword extraction and were skipped')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the keyword lists ('before') alongside their source text ('after').\n",
    "payload = {'before': before, 'after': after}\n",
    "with open('product-description.json', 'w') as sink:\n",
    "    json.dump(payload, sink)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
